Synchronize after every op

jart · jart · commit 9cf736355383 · 2024-04-29T20:35:39.000-07:00
The previous commit loosened the synchronization barriers too much, and
it caused some rare brain damage when using quants. With this change,
we're now only using 3-4x fewer synchronization barriers than before.

A new --trap flag has been introduced by this change, which enables the
runtime to detect the precise moment a NaN appears anywhere in the code
and then abort with both a C++ backtrace and a GGML graph trace.

Some jankiness with pledge() and /dev/tty is also fixed by this change.
diff --git a/build/config.mk b/build/config.mk
@@ -13,8 +13,8 @@ MKDEPS = $(COSMOCC)/bin/mkdeps
 INSTALL = install
 
 ARFLAGS = rcsD
-CCFLAGS = -g -O3 -fexceptions
-CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes
+CCFLAGS = -g -O3 -fexceptions -fsignaling-nans
+CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM -Wno-attributes -DLLAMAFILE_DEBUG
 TARGET_ARCH = -Xx86_64-mavx -Xx86_64-mtune=znver4
 
 TMPDIR = o//tmp
diff --git a/llama.cpp/common.cpp b/llama.cpp/common.cpp
@@ -233,6 +233,12 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--cli") {
         return true;
     }
+    if (arg == "--trap") {
+        FLAG_trap = true;
+        FLAG_unsecure = true; // for better backtraces
+        llamafile_trapping_enabled(+1);
+        return true;
+    }
     if (arg == "--unsecure") {
         FLAG_unsecure = true;
         return true;
diff --git a/llama.cpp/console.cpp b/llama.cpp/console.cpp
@@ -1,13 +1,15 @@
 // -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-
 // vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
+
 #include "console.h"
+
 #include <vector>
 #include <iostream>
-
 #include <climits>
 #include <sys/ioctl.h>
 #include <unistd.h>
 #include <wchar.h>
+#include <cosmo.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <signal.h>
@@ -30,6 +32,7 @@ namespace console {
 
 static bool      advanced_display = false;
 static bool      simple_io        = true;
+static bool      should_close_tty = false;
 static display_t current_display  = reset;
 static FILE*     out              = stdout;
 static FILE*     tty              = nullptr;
@@ -40,19 +43,32 @@ static termios   initial_state;
 //
 
 void init(bool use_simple_io, bool use_advanced_display) {
-    advanced_display = use_advanced_display;
+    should_close_tty = false;
     simple_io = use_simple_io;
+    advanced_display = use_advanced_display;
     if (!simple_io) {
-        struct termios new_termios;
-        tcgetattr(STDIN_FILENO, &initial_state);
-        new_termios = initial_state;
-        new_termios.c_lflag &= ~(ICANON | ECHO);
-        new_termios.c_cc[VMIN] = 1;
-        new_termios.c_cc[VTIME] = 0;
-        tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
         tty = fopen("/dev/tty", "w+e");
+        if (tty) {
+            should_close_tty = true;
+        } else if (IsLinux() || IsOpenbsd()) {
+            // this could happen because pledge() blocked us
+            tty = fdopen(0, "w+e");
+        }
         if (tty != nullptr) {
-            out = tty;
+            if (!tcgetattr(fileno(tty), &initial_state)) {
+                out = tty;
+                struct termios new_termios = initial_state;
+                new_termios.c_lflag &= ~(ICANON | ECHO);
+                new_termios.c_cc[VMIN] = 1;
+                new_termios.c_cc[VTIME] = 0;
+                tcsetattr(fileno(tty), TCSANOW, &new_termios);
+            } else {
+                simple_io = true;
+                fclose(tty);
+                tty = 0;
+            }
+        } else {
+            simple_io = true;
         }
     }
     setlocale(LC_ALL, "");
@@ -64,11 +80,14 @@ void cleanup() {
     // Restore settings
     if (!simple_io) {
         if (tty != nullptr) {
-            out = stdout;
-            fclose(tty);
+            fflush(tty);
+            tcsetattr(fileno(tty), TCSANOW, &initial_state);
+            if (should_close_tty) {
+                fclose(tty);
+            }
             tty = nullptr;
+            out = stdout;
         }
-        tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
     }
 }
 
diff --git a/llama.cpp/ggml.c b/llama.cpp/ggml.c
@@ -117,9 +117,9 @@ void ggml_print_backtrace(void) {
 
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
-#define GGML_GELU_FP16
-#define GGML_GELU_QUICK_FP16
-#define GGML_SILU_FP16
+// #define GGML_GELU_FP16
+// #define GGML_GELU_QUICK_FP16
+// #define GGML_SILU_FP16
 // #define GGML_CROSS_ENTROPY_EXP_FP16
 // #define GGML_FLASH_ATTN_EXP_FP16
 
@@ -2679,6 +2679,7 @@ static inline int ggml_up(int n, int m) {
 struct ggml_context * ggml_init(struct ggml_init_params params) {
     // make this function thread safe
     ggml_critical_section_start();
+    llamafile_trapping_enabled(-1);
 
     static bool is_first_call = true;
 
@@ -2750,6 +2751,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     if (ctx == NULL) {
         GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
 
+        llamafile_trapping_enabled(+1);
         ggml_critical_section_end();
 
         return NULL;
@@ -2781,6 +2783,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
+    llamafile_trapping_enabled(+1);
     ggml_critical_section_end();
 
     return ctx;
@@ -7105,19 +7108,12 @@ void ggml_syncthreads(struct ggml_compute_params * params) {
 // ops must call this before accessing wdata
 // otherwise overlapping threads on previous op might clobber
 void *ggml_acquire_wdata(struct ggml_compute_params * params) {
-    if (params->limbo) {
-        ggml_syncthreads(params);
-    }
-    params->limbo = true;
     return params->wdata;
 }
 
 // ops should call this after writing to wdata before reading
 void ggml_release_wdata(struct ggml_compute_params * params) {
-    if (params->limbo) {
-        ggml_syncthreads(params);
-        params->limbo = false;
-    }
+    ggml_syncthreads(params);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -11531,12 +11527,9 @@ static void ggml_compute_forward_set_f32(
 
     if (!inplace) {
         if (!params->ith) {
-            // memcpy needs to be synchronized across threads to avoid race conditions.
-            // => do it in INIT phase
-            memcpy(
-                ((char *)  dst->data),
-                ((char *) src0->data),
-                ggml_nbytes(dst));
+            memcpy(((char *)  dst->data),
+                   ((char *) src0->data),
+                   ggml_nbytes(dst));
         }
         ggml_syncthreads(params);
     }
@@ -17759,35 +17752,6 @@ struct ggml_compute_state {
     enum ggml_status ec;
 };
 
-// returns true if `src` is direct dependency of `node`
-static bool ggml_has_src(const struct ggml_tensor * node,
-                         const struct ggml_tensor * src) {
-    for (int i = 0; i < GGML_MAX_SRC; ++i) {
-        if (node->src[i] == src) {
-            return true;
-        }
-    }
-    return false;
-}
-
-// returns true if `dest` depends on any outputs since `mark`
-//
-//   - cgraph->nodes[dest] is about to be executed
-//   - syncthreads() last happened right before cgraph->nodes[mark] executed
-//   - syncthreads() has 1.8 µs overhead minimum on Apple M2 when nth == 12
-//
-static bool ggml_needs_barrier(const struct ggml_cgraph * cgraph, int mark, int dest) {
-    assert(mark >= 0);
-    assert(mark <= dest);
-    assert(dest < cgraph->n_nodes);
-    for (; mark < dest; ++mark) {
-        if (ggml_has_src(cgraph->nodes[dest], cgraph->nodes[mark])) {
-            return true;
-        }
-    }
-    return false;
-}
-
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state  = (struct ggml_compute_state *) data;
     const struct ggml_cgraph  * cgraph = state->shared->cgraph;
@@ -17799,11 +17763,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         /*.wsize   =*/ cplan->work_size,
         /*.wdata   =*/ cplan->work_data,
         /*.barrier =*/ &state->shared->barrier,
-        /*.limbo   =*/ false,
     };
 
     set_numa_thread_affinity(state->ith);
 
+#ifdef LLAMAFILE_DEBUG
+    if (FLAG_trap) {
+        llamafile_trapping_enabled(+1);
+    }
+#endif
+
 #ifdef GGML_PERF
     int64_t start_cycles, start_time_us;
     if (!state->ith) {
@@ -17819,17 +17788,19 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             return 0;
         }
 
-        if (ggml_needs_barrier(cgraph, mark, node_n)) {
-            ggml_syncthreads(&params);
-            mark = node_n;
-        }
+#ifdef LLAMAFILE_DEBUG
+        llamafile_debug_op_index = node_n;
+#endif
 
         struct ggml_tensor *node = cgraph->nodes[node_n];
         params.nth = state->shared->n_threads;
         ggml_compute_forward(&params, node);
 
+        // this barrier could potentially be eliminated in 15%+ of cases
+        // however, it would give rise to ghoulish errors w/ little gain
+        ggml_syncthreads(&params);
+
 #if GGML_PERF
-        ggml_syncthreads(&state->shared->barrier);
         if (!state->ith) {
             int64_t end_cycles  = ggml_perf_cycles();
             int64_t end_time_us = ggml_perf_time_us();
@@ -17844,8 +17815,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 #endif
     }
 
-    ggml_syncthreads(&params);
-
     return 0;
 }
 
@@ -18075,6 +18044,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
+#ifdef LLAMAFILE_DEBUG
+    llamafile_debug_graph = cgraph;
+#endif
+
     // create thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; ++j) {
@@ -18134,6 +18107,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
 
     // fprintf(stderr, "%6d barriers %6d ops\n", state_shared.barrier.phase, cgraph->n_nodes);
 
+#ifdef LLAMAFILE_DEBUG
+    llamafile_debug_graph = 0;
+#endif
+
     return compute_status;
 }
 
diff --git a/llama.cpp/ggml.h b/llama.cpp/ggml.h
@@ -678,7 +678,6 @@ extern "C" {
         void * wdata;
 
         struct ggml_barrier * barrier;
-        bool limbo;
     };
 
     GGML_API void   ggml_syncthreads  (struct ggml_compute_params *);
diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp
@@ -184,20 +184,6 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    if (!params.mmproj.empty() &&
-        (!params.image.empty() ||
-         params.prompt.find("<img src=\"") != std::string::npos)) {
-        return llava_cli(argc, argv, &params);
-    }
-
-    // TODO: Dump params ?
-    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
-
-    // save choice to use color for later
-    // (note for later: this is a slightly awkward choice)
-    console::init(params.simple_io, params.use_color);
-    atexit([]() { console::cleanup(); });
-
     if (!FLAG_unsecure && !llamafile_has_gpu()) {
         // Enable pledge() security on Linux and OpenBSD.
         // - We do this *after* opening the log file for writing.
@@ -213,6 +199,7 @@ int main(int argc, char ** argv) {
         } else {
             promises = "stdio rpath tty";
         }
+        __pledge_mode = PLEDGE_PENALTY_RETURN_EPERM;
         if (pledge(0, 0)) {
             LOG_TEE("warning: this OS doesn't support pledge() security\n");
         } else if (pledge(promises, 0)) {
@@ -221,6 +208,20 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.mmproj.empty() &&
+        (!params.image.empty() ||
+         params.prompt.find("<img src=\"") != std::string::npos)) {
+        return llava_cli(argc, argv, &params);
+    }
+
+    // TODO: Dump params ?
+    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+
+    // save choice to use color for later
+    // (note for later: this is a slightly awkward choice)
+    console::init(!params.interactive || params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });
+
     if (params.logits_all) {
         printf("\n************\n");
         printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
diff --git a/llamafile/debug.cpp b/llamafile/debug.cpp
diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h